root.dir <- "~/OneDrive/projects/"
project.dir <- paste0(root.dir,"PDX/")
source(paste0(root.dir,"R.utils/RNASEQ.utils.R"))
source(paste0(project.dir,"src/load.PDX.R"))
library(iC10)
library(genefu)
library(preprocessCore)
ensembl.human = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ensembl.mouse = useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl")
rnaseq <- get.PDX()
|--------------------------------------------------|
|==================================================|
meta <- load.meta()
meta <- subset.meta(meta,meta[,project=="characterisation"])
rnaseq <- subset.PDX(rnaseq, rnaseq$meta[,Sample.name] %in% meta[,Sample.name])
rnaseq$meta[Sample.name=="AB793-T1-T" & Pool=="SLX-12501",Sample.name:=paste0(Sample.name,".R2")]
meta <- rbind(meta,meta[Sample.name=="AB793-T1-T"])
meta[nrow(meta),Sample.name:=paste0(Sample.name,".R2")]
rnaseq <- merge.PDX(rnaseq)
#rnaseq$meta[,merged.lanes:=as.numeric(as.factor(merged.lanes))]
rnaseq$meta <- merge(rnaseq$meta,meta,sort=F)
rnaseq <- subset.PDX(rnaseq, rnaseq$meta[,human.library.size > 1e6])
rnaseq <- subset.PDX(rnaseq, rnaseq$meta[,mouse.library.size/(human.library.size+mouse.library.size) < .8])
rnaseq <- subset.PDX(rnaseq, rnaseq$meta[,sample.type!="cell.line"])
pca.human <- pca.run(rnaseq$human$data)
ggpairs(cbind(pca.human,rnaseq$meta),columns = 1:3,mapping = aes(color=factor(merged.lanes)),title="Coloured by sequencing runs")
ggpairs(cbind(pca.human,rnaseq$meta),columns = 1:3,mapping = aes(color=factor(sample.type)),title="Coloured by sample type")
ggpairs(cbind(pca.human,rnaseq$meta),columns = 1:3,mapping = aes(color=cut(mouse.library.size/(human.library.size+mouse.library.size),quantile(mouse.library.size/(human.library.size+mouse.library.size)))),title="Coloured by mouse fraction")
x <- cbind(pca.human,rnaseq$meta)
x[sample.type=="xenograft" & PC1>-2 & PC2>5,.(Sample.name,merged.lanes,PC1)]
Sample.name merged.lanes PC1
1: AB642-M1-X01-00.020706-T SLX-12501:HFV2VBBXX:s_5 19.830498
2: AB613-M1-X01-00.011825-T SLX-12501:HFV2VBBXX:s_5 23.837450
3: NKI250-T1-X01-00.018782-T SLX-12501:HFV2VBBXX:s_5 18.600173
4: NKI250-T1-X00-00.037109-T SLX-12501:HFV2VBBXX:s_5 14.859144
5: NKI336-M1-X01-00.024208-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 5.181548
6: AB702-M1-X00-14.030064-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 20.228967
7: NKI302-T1-X00-00.035209-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 18.302795
8: AB613-M1-X00-00.002369-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 25.709310
9: NKI302-T1-X01-00.022813-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 14.336896
10: AB635-T1-X00-00.000000-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 5.811192
11: AB405-T1-X00-00.022964-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 21.278972
x[sample.type=="primary" & PC1< -2 & PC2>5,.(Sample.name,merged.lanes,PC1)]
Sample.name merged.lanes PC1
1: AB790-T1-T SLX-12501:HFV2VBBXX:s_5 -11.89318
2: AB642-M1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -17.43823
3: AB692-M1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -18.37213
4: AB692-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -19.14371
5: AB786-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -10.39128
6: AB641-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -15.77172
7: AB638-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -16.80436
8: AB725-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -18.10548
9: AB612-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -15.80014
10: AB710-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -18.29053
11: AB694-T1-T SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7 -17.90930
(y <- x[merged.lanes %in% c("SLX-12501:HFV2VBBXX:s_5"),.(Sample.name,sample.type,primary.on.pca=PC1>-2,PC1)])
Sample.name sample.type primary.on.pca PC1
1: AB590-T1-X00-00.000000-T xenograft FALSE -9.159861
2: AB636-M1-T primary TRUE 26.548445
3: AB793-T1-T.R2 primary TRUE 17.677395
4: AB642-M1-X01-00.020706-T xenograft TRUE 19.830498
5: AB613-M1-X01-00.011825-T xenograft TRUE 23.837450
6: NKI250-T1-X01-00.018782-T xenograft TRUE 18.600173
7: AB677-T1-T primary TRUE 25.136646
8: NKI250-T1-X00-00.037109-T xenograft TRUE 14.859144
9: AB635-M1-T primary TRUE 23.079182
10: AB693-T1-T primary TRUE 9.259491
11: AB768-T1-T primary TRUE 5.820192
12: AB694-T1-X00-00.027099-T xenograft FALSE -8.685637
13: AB790-T1-T primary FALSE -11.893180
14: AB767-T1-T primary TRUE 12.480339
y[,.N,.(sample.type,primary.on.pca)]
sample.type primary.on.pca N
1: xenograft FALSE 2
2: primary TRUE 7
3: xenograft TRUE 4
4: primary FALSE 1
(y <- x[merged.lanes %in% c("SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7"),.(Sample.name,sample.type,primary.on.pca=PC1>-2,PC1)])
Sample.name sample.type primary.on.pca PC1
1: AB577-M1-X01-00.000000-T xenograft FALSE -19.8660467
2: NKI336-T1-X00-00.035208-T xenograft FALSE -16.3855562
3: AB642-M1-T primary FALSE -17.4382318
4: AB636-M1-X00-00.000000-T xenograft FALSE -15.9302111
5: AB692-M1-T primary FALSE -18.3721287
6: AB692-T1-T primary FALSE -19.1437052
7: AB786-T1-T primary FALSE -10.3912754
8: AB641-T1-T primary FALSE -15.7717243
9: AB638-T1-T primary FALSE -16.8043639
10: NKI336-M1-X01-00.024208-T xenograft TRUE 5.1815485
11: AB793-T1-T primary TRUE 13.6897316
12: AB405-T1-T primary TRUE 7.5516926
13: AB638-T1-X00-00.000000-T xenograft FALSE -15.2883010
14: NKI127-T1-X00-00.035207-T xenograft FALSE -18.8150188
15: AB785-T1-T primary TRUE 0.1140427
16: AB725-T1-T primary FALSE -18.1054783
17: AB802-T1-T primary TRUE 14.5952252
18: AB613-M1-T primary TRUE 8.5910833
19: AB590-T1-T primary TRUE 24.7826696
20: AB702-M1-X00-14.030064-T xenograft TRUE 20.2289670
21: AB612-T1-T primary FALSE -15.8001431
22: NKI302-T1-X00-00.035209-T xenograft TRUE 18.3027953
23: AB769-T1-T primary TRUE 21.8949170
24: AB702-M1-T primary TRUE 24.2251866
25: AB613-M1-X00-00.002369-T xenograft TRUE 25.7093101
26: AB710-T1-T primary FALSE -18.2905334
27: AB694-T1-T primary FALSE -17.9092967
28: NKI302-T1-X01-00.022813-T xenograft TRUE 14.3368963
29: AB635-T1-X00-00.000000-T xenograft TRUE 5.8111919
30: AB405-T1-X00-00.022964-T xenograft TRUE 21.2789715
Sample.name sample.type primary.on.pca PC1
y[,.N,.(sample.type,primary.on.pca)]
sample.type primary.on.pca N
1: xenograft FALSE 5
2: primary FALSE 10
3: xenograft TRUE 7
4: primary TRUE 8
rnaseq <- subset.PDX(rnaseq, rnaseq$meta[,!(merged.lanes %in% c("SLX-12501:HFV2VBBXX:s_5","SLX-12507:HFV2VBBXX:s_6;SLX-12507:HFV2VBBXX:s_7"))])
rnaseq$meta[,merged.lanes:=as.numeric(as.factor(merged.lanes))]
pca.human <- pca.run(rnaseq$human$data,rename = T)
pca.human <- cbind(pca.human,rnaseq$meta)
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(merged.lanes)),title="Coloured by sequencing runs")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(sample.type)),title="Coloured by sample type")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_PAT),title="Coloured by ER_PAT status")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_IHC_PDX),title="Coloured by ER_IHC_PDX status")
We see that while some batch effect remains between the samples, the major drivers of variability is xenograft vs primary alongside ER status. For the latter of these, we see more seperation by ER in xenographs, though this could be due to the biased composition of the cohort.
for(stype in unique(rnaseq$meta$sample.type)){
x <- subset.PDX(rnaseq,rnaseq$meta$sample.type == stype)
pca.human <- pca.run(x$human$data,rename = T)
pca.human <- cbind(pca.human,x$meta)
print(ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(merged.lanes)),title=paste(stype,"Coloured by sequencing runs")))
print(ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_PAT),title=paste(stype,"Coloured by ER_PAT status")))
if(stype == "xenograft") print(ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_IHC_PDX),title=paste(stype,"Coloured by ER_IHC_PDX status")))
}
We still don’t see seperation by ER status in the primary samples when we analyse them independently of the xenografts. As to the xenograft samples, we see clearly the batch effect by (sequencing) run. We also see seperation due to ER status. Curriously, some samples do not cluster with the ER status they were found to have by IHC.
x <- subset.PDX(rnaseq,rnaseq$meta$sample.type == "xenograft")
pca.human <- pca.run(x$human$data,rename = T)
pca.human <- cbind(pca.human,x$meta)
pca.human[ER_PAT != "" & ER_IHC_PDX != "",sum(ER_PAT!=ER_IHC_PDX)/.N]
[1] 0.1702128
set.seed(1)
pca.human$er.pca <- as.factor(kmeans(pca.human[,1],2)$cluster)
levels(pca.human$er.pca) <- c("NEG","POS")
table(pca.human[ER_PAT != "",.(er.pca,ER_PAT)])
ER_PAT
er.pca NEG POS
NEG 49 13
POS 4 29
table(pca.human[ER_IHC_PDX %in% c("POS","NEG"),.(er.pca,ER_IHC_PDX)])
ER_IHC_PDX
er.pca NEG POS
NEG 52 7
POS 8 27
17% of samples change ER status according to IHC. 83% accuracy w.r.t IHC on patients. 86% accuracy w.r.t IHC on xenografts.
Unsuprisingly, the testing on xenografts works better, though an error rate greater than 10% is till concerning.
x <- subset.PDX(rnaseq,rnaseq$meta$sample.type == "xenograft")
pca.human <- pca.run(x$human$data)
set.seed(1)
x$meta$er.pca <- as.factor(kmeans(pca.human[,1],2)$cluster)
levels(x$meta$er.pca) <- c("NEG","POS")
y <- norm.count.matrix(x$human$data,lib.sizes = colSums(x$human$data))
x$meta$ER.exp <- y[which(rownames(y) == "ENSG00000091831"),]
x <- x$meta[,.(Sample.name,ER.exp,er.pca,ER_PAT,ER_IHC_PDX)]
x <- melt(x,id.vars = 1:2)
ggplot(x[value %in% c("NEG","POS")], aes(x=variable,y=ER.exp,colour=value)) + geom_boxplot() + geom_point() +theme_tufte()
ER expression does not exclusively drive the clustering, as we see many samples with low ER expression in the “NEG” cluster. However, it should be noted the samples that express ER differently to that found in ER_IHC_PDX.
print("False neg:")
[1] "False neg:"
x[variable=="ER_IHC_PDX" & value=="NEG"][order(-ER.exp)][1]
Sample.name ER.exp variable value
1: AB764-T1-P01-.35287-T1-FF1 7.041696 ER_IHC_PDX NEG
print("False pos:")
[1] "False pos:"
x[variable=="ER_IHC_PDX" & value=="POS"][order(ER.exp)][1:2]
Sample.name ER.exp variable value
1: AB892-T1-P00-.35365-T1-FF1 -0.2773063 ER_IHC_PDX POS
2: CAM2004T2-T2-P01-.29055-T1-FF1 -0.2327316 ER_IHC_PDX POS
x <- subset.PDX(rnaseq,rnaseq$meta$sample.type == "xenograft")
y <- pca.prep(x$human$data,T,1000,colSums(x$human$data))
y <- prcomp(t(y),scale=T)
pc1 <- y$rotation[,1]
pc1 <- data.table(ENSEMBL=names(pc1),value=pc1)
pc1 <- data.table(merge(biomaRt::select(org.Hs.eg.db,keys=pc1$ENSEMBL,keytype = "ENSEMBL",
columns = c("ENSEMBL","ENTREZID","SYMBOL","GENENAME"),all.x=T),pc1))
'select()' returned 1:many mapping between keys and columns
pc1[order(-abs(value))][1:20,.(SYMBOL,GENENAME,value)]
SYMBOL GENENAME value
1: TFF3 trefoil factor 3 0.05844562
2: BCL11A BAF chromatin remodeling complex subunit BCL11A -0.05802838
3: FBP1 fructose-bisphosphatase 1 0.05726330
4: CALD1 caldesmon 1 -0.05686603
5: KRT5 keratin 5 -0.05667319
6: TFF1 trefoil factor 1 0.05659660
7: CAV1 caveolin 1 -0.05642500
8: MAML2 mastermind like transcriptional coactivator 2 -0.05624191
9: ABCC11 ATP binding cassette subfamily C member 11 0.05619574
10: SFRP1 secreted frizzled related protein 1 -0.05593166
11: DZIP1 DAZ interacting zinc finger protein 1 -0.05589324
12: RHOH ras homolog family member H 0.05560767
13: BCAS1 breast carcinoma amplified sequence 1 0.05541772
14: B3GNT5 UDP-GlcNAc:betaGal beta-1,3-N-acetylglucosaminyltransferase 5 -0.05508903
15: SERPINB5 serpin family B member 5 -0.05453957
16: FSCN1 fascin actin-bundling protein 1 -0.05452831
17: CAV2 caveolin 2 -0.05430549
18: GASK1B golgi associated kinase 1B 0.05406583
19: RGMA repulsive guidance molecule BMP co-receptor a -0.05401768
20: PM20D2 peptidase M20 domain containing 2 -0.05373181
No one marker gene dominates PC1. For reference, ESR1 has rank 88.
y <- norm.count.matrix(rnaseq$human$data,lib.sizes = colSums(rnaseq$human$data))
x <- cbind(rnaseq$meta,ER.exp = y[which(rownames(y) == "ENSG00000091831"),])
ggplot(x,aes(x=sample.type,y=ER.exp,color=factor(kmeans(x$ER.exp,2)$cluster))) + geom_jitter(width=.2)
Kmeans method can be expanded to include promary samples.
Summary: - Batch effect due to sequencing to correct. - Xenograft/Primary and ER.status drives variability.
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
y <- cbind(rnaseq$meta,ER.exp=x[which(rownames(x) == "ENSG00000091831"),])
ggplot(y) + aes(x=reorder(Sample.name,ER.exp),y=ER.exp,color=ER_IHC_PDX) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
ggplot(y) + aes(x=reorder(Sample.name,ER.exp),y=ER.exp,color=ER_PAT) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
Ideally we would do the assignment of pos/neg within lanes, but that isn’t possible due to some lanes containing only POS.
library(pROC)
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
y <- cbind(rnaseq$meta,ER.exp=x[which(rownames(x) == "ENSG00000091831"),])
roc.pdx <- y[ER_IHC_PDX %in% c("POS","NEG") & sample.type=="xenograft",roc(ER_IHC_PDX,ER.exp)]
Setting levels: control = NEG, case = POS
Setting direction: controls < cases
plot(roc.pdx)
(er.thresh.pdx <- data.table(t(coords(roc.pdx,"all",ret=c("threshold","accuracy"))))[which.max(accuracy),threshold])
An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
[1] 3.168585
roc.pri <- y[ER_PAT %in% c("POS","NEG") & sample.type=="primary",roc(ER_PAT,ER.exp)]
Setting levels: control = NEG, case = POS
Setting direction: controls < cases
plot(roc.pri)
(er.thresh.pri <- data.table(t(coords(roc.pri,"all",ret=c("threshold","accuracy"))))[which.max(accuracy),threshold])
An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
[1] 0.7195949
ggplot(y) + aes(x=reorder(Sample.name,ER.exp),y=ER.exp,color=ER_IHC_PDX) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank()) + geom_hline(aes(yintercept=er.thresh.pdx))
ggplot(y) + aes(x=reorder(Sample.name,ER.exp),y=ER.exp,color=ER_PAT) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank()) + geom_hline(aes(yintercept=er.thresh.pri))
y[,er.inferred:=ifelse(sample.type=="xenograft", ifelse(ER.exp>er.thresh.pdx,"POS","NEG"), ifelse(ER.exp>er.thresh.pri,"POS","NEG"))]
ggplot(y) + aes(x=reorder(Sample.name,ER.exp),y=ER.exp,color=er.inferred) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
rnaseq$meta$er.inferred <- y$er.inferred
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
y <- cbind(rnaseq$meta,HER2.exp=x[which(rownames(x) == "ENSG00000141736"),])
ggplot(y) + aes(x=reorder(Sample.name,HER2.exp),y=HER2.exp,color=HER2_PDX) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
ggplot(y) + aes(x=reorder(Sample.name,HER2.exp),y=HER2.exp,color=HER2_PAT) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
Ideally we would do the assignment of pos/neg within lanes, but that isn’t possible due to some lanes containing only POS.
library(pROC)
roc.pdx <- y[HER2_PDX %in% c("POS","NEG") & sample.type=="xenograft",roc(HER2_PDX,HER2.exp)]
Setting levels: control = NEG, case = POS
Setting direction: controls < cases
plot(roc.pdx)
(HER2.thresh.pdx <- data.table(t(coords(roc.pdx,"all",ret=c("threshold","accuracy"))))[which.max(accuracy),threshold])
An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
[1] 6.010892
roc.pri <- y[HER2_PAT %in% c("POS","NEG") & sample.type=="primary",roc(HER2_PAT,HER2.exp)]
Setting levels: control = NEG, case = POS
Setting direction: controls < cases
plot(roc.pri)
(HER2.thresh.pri <- data.table(t(coords(roc.pri,"all",ret=c("threshold","accuracy"))))[which.max(accuracy),threshold])
An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
[1] 5.27486
ggplot(y) + aes(x=reorder(Sample.name,HER2.exp),y=HER2.exp,color=HER2_PDX) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank()) + geom_hline(aes(yintercept=HER2.thresh.pdx))
ggplot(y) + aes(x=reorder(Sample.name,HER2.exp),y=HER2.exp,color=HER2_PAT) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank()) + geom_hline(aes(yintercept=HER2.thresh.pri))
y[,HER2.inferred:=ifelse(sample.type=="xenograft", ifelse(HER2.exp>HER2.thresh.pdx,"POS","NEG"), ifelse(HER2.exp>HER2.thresh.pri,"POS","NEG"))]
ggplot(y) + aes(x=reorder(Sample.name,HER2.exp),y=HER2.exp,color=HER2.inferred) + geom_point() + facet_grid(sample.type~merged.lanes) + theme(axis.text.x = element_blank())
rnaseq$meta$HER2.inferred <- y$HER2.inferred
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
pca.human <- pca.run(x,normalise = F)
pca.human <- cbind(pca.human,rnaseq$meta)
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(merged.lanes)),title="Coloured by sequencing runs")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(sample.type)),title="Coloured by sample type")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_PAT),title="Coloured by ER_PAT status")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_IHC_PDX),title="Coloured by ER_IHC_PDX status")
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
sequencing <- rnaseq$meta[,merged.lanes]
sample.type <- rnaseq$meta[,sample.type]
er.type <- rnaseq$meta$er.inferred
her2.type <- rnaseq$meta$HER2.inferred
d <- model.matrix(~ 0 + sample.type + er.type + her2.type)
x <- removeBatchEffect(x,batch = sequencing,design = d)
pca.human <- pca.run(x,normalise = F,rename = T)
pca.human <- cbind(pca.human,rnaseq$meta)
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(merged.lanes)),title="Coloured by sequencing runs")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=factor(sample.type)),title="Coloured by sample type")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_PAT),title="Coloured by ER_PAT status")
ggpairs(pca.human,columns = 1:3,mapping = aes(color=ER_IHC_PDX),title="Coloured by ER_IHC_PDX status")
keyMap <- data.table(getBM(
attributes=c('mgi_symbol','ensembl_gene_id'),
filters = 'ensembl_gene_id',
values = rownames(rnaseq.TPMs$mouse$TPMs.correct),
mart = ensembl.mouse
))
Batch submitting query [=>-----------------------------------------------------------------------------------------] 2% eta: 2m
Batch submitting query [==>----------------------------------------------------------------------------------------] 3% eta: 4m
Batch submitting query [===>---------------------------------------------------------------------------------------] 4% eta: 3m
Batch submitting query [====>--------------------------------------------------------------------------------------] 5% eta: 4m
Batch submitting query [====>--------------------------------------------------------------------------------------] 6% eta: 5m
Batch submitting query [=====>-------------------------------------------------------------------------------------] 7% eta: 4m
Batch submitting query [======>------------------------------------------------------------------------------------] 8% eta: 5m
Batch submitting query [=======>-----------------------------------------------------------------------------------] 9% eta: 5m
Batch submitting query [========>----------------------------------------------------------------------------------] 10% eta: 5m
Batch submitting query [=========>---------------------------------------------------------------------------------] 11% eta: 5m
Batch submitting query [==========>--------------------------------------------------------------------------------] 12% eta: 5m
Batch submitting query [===========>-------------------------------------------------------------------------------] 13% eta: 5m
Batch submitting query [============>------------------------------------------------------------------------------] 14% eta: 5m
Batch submitting query [=============>-----------------------------------------------------------------------------] 15% eta: 5m
Batch submitting query [==============>----------------------------------------------------------------------------] 16% eta: 5m
Batch submitting query [==============>----------------------------------------------------------------------------] 17% eta: 5m
Batch submitting query [===============>---------------------------------------------------------------------------] 18% eta: 5m
Batch submitting query [================>--------------------------------------------------------------------------] 19% eta: 4m
Batch submitting query [=================>-------------------------------------------------------------------------] 20% eta: 4m
Batch submitting query [==================>------------------------------------------------------------------------] 21% eta: 4m
Batch submitting query [===================>-----------------------------------------------------------------------] 22% eta: 4m
Batch submitting query [====================>----------------------------------------------------------------------] 23% eta: 4m
Batch submitting query [=====================>---------------------------------------------------------------------] 24% eta: 4m
Batch submitting query [======================>--------------------------------------------------------------------] 25% eta: 4m
Batch submitting query [=======================>-------------------------------------------------------------------] 26% eta: 4m
Batch submitting query [========================>------------------------------------------------------------------] 27% eta: 4m
Batch submitting query [========================>------------------------------------------------------------------] 28% eta: 3m
Batch submitting query [=========================>-----------------------------------------------------------------] 29% eta: 3m
Batch submitting query [==========================>----------------------------------------------------------------] 30% eta: 3m
Batch submitting query [===========================>---------------------------------------------------------------] 31% eta: 3m
Batch submitting query [============================>--------------------------------------------------------------] 32% eta: 3m
Batch submitting query [=============================>-------------------------------------------------------------] 33% eta: 3m
Batch submitting query [==============================>------------------------------------------------------------] 34% eta: 3m
Batch submitting query [===============================>-----------------------------------------------------------] 35% eta: 3m
Batch submitting query [================================>----------------------------------------------------------] 36% eta: 3m
Batch submitting query [=================================>---------------------------------------------------------] 37% eta: 3m
Batch submitting query [==================================>--------------------------------------------------------] 38% eta: 3m
Batch submitting query [==================================>--------------------------------------------------------] 39% eta: 3m
Batch submitting query [===================================>-------------------------------------------------------] 40% eta: 3m
Batch submitting query [====================================>------------------------------------------------------] 41% eta: 3m
Batch submitting query [=====================================>-----------------------------------------------------] 42% eta: 3m
Batch submitting query [======================================>----------------------------------------------------] 43% eta: 3m
Batch submitting query [=======================================>---------------------------------------------------] 44% eta: 2m
Batch submitting query [========================================>--------------------------------------------------] 45% eta: 2m
Batch submitting query [=========================================>-------------------------------------------------] 46% eta: 2m
Batch submitting query [==========================================>------------------------------------------------] 47% eta: 2m
Batch submitting query [===========================================>-----------------------------------------------] 48% eta: 2m
Batch submitting query [============================================>----------------------------------------------] 49% eta: 2m
Batch submitting query [=============================================>---------------------------------------------] 50% eta: 2m
Batch submitting query [=============================================>---------------------------------------------] 51% eta: 2m
Batch submitting query [==============================================>--------------------------------------------] 52% eta: 2m
Batch submitting query [===============================================>-------------------------------------------] 53% eta: 2m
Batch submitting query [================================================>------------------------------------------] 54% eta: 2m
Batch submitting query [=================================================>-----------------------------------------] 55% eta: 2m
Batch submitting query [==================================================>----------------------------------------] 56% eta: 2m
Batch submitting query [===================================================>---------------------------------------] 57% eta: 2m
Batch submitting query [====================================================>--------------------------------------] 58% eta: 2m
Batch submitting query [=====================================================>-------------------------------------] 59% eta: 2m
Batch submitting query [======================================================>------------------------------------] 60% eta: 2m
Batch submitting query [=======================================================>-----------------------------------] 61% eta: 2m
Batch submitting query [=======================================================>-----------------------------------] 62% eta: 2m
Batch submitting query [========================================================>----------------------------------] 63% eta: 2m
Batch submitting query [=========================================================>---------------------------------] 64% eta: 2m
Batch submitting query [==========================================================>--------------------------------] 65% eta: 2m
Batch submitting query [===========================================================>-------------------------------] 66% eta: 2m
Batch submitting query [============================================================>------------------------------] 67% eta: 2m
Batch submitting query [=============================================================>-----------------------------] 68% eta: 2m
Batch submitting query [==============================================================>----------------------------] 69% eta: 1m
Batch submitting query [===============================================================>---------------------------] 70% eta: 1m
Batch submitting query [================================================================>--------------------------] 71% eta: 1m
Batch submitting query [=================================================================>-------------------------] 72% eta: 1m
Batch submitting query [=================================================================>-------------------------] 73% eta: 1m
Batch submitting query [==================================================================>------------------------] 74% eta: 1m
Batch submitting query [===================================================================>-----------------------] 75% eta: 1m
Batch submitting query [====================================================================>----------------------] 76% eta: 1m
Batch submitting query [=====================================================================>---------------------] 77% eta: 1m
Batch submitting query [======================================================================>--------------------] 78% eta: 1m
Batch submitting query [=======================================================================>-------------------] 79% eta: 1m
Batch submitting query [========================================================================>------------------] 80% eta: 1m
Batch submitting query [=========================================================================>-----------------] 81% eta: 1m
Batch submitting query [==========================================================================>----------------] 82% eta: 50s
Batch submitting query [===========================================================================>---------------] 83% eta: 46s
Batch submitting query [===========================================================================>---------------] 84% eta: 43s
Batch submitting query [============================================================================>--------------] 85% eta: 40s
Batch submitting query [=============================================================================>-------------] 86% eta: 38s
Batch submitting query [==============================================================================>------------] 87% eta: 35s
Batch submitting query [===============================================================================>-----------] 88% eta: 32s
Batch submitting query [================================================================================>----------] 89% eta: 29s
Batch submitting query [=================================================================================>---------] 90% eta: 27s
Batch submitting query [==================================================================================>--------] 91% eta: 24s
Batch submitting query [===================================================================================>-------] 92% eta: 21s
Batch submitting query [====================================================================================>------] 93% eta: 18s
Batch submitting query [=====================================================================================>-----] 94% eta: 15s
Batch submitting query [=====================================================================================>-----] 95% eta: 13s
Batch submitting query [======================================================================================>----] 96% eta: 10s
Batch submitting query [=======================================================================================>---] 97% eta: 8s
Batch submitting query [========================================================================================>--] 98% eta: 5s
Batch submitting query [=========================================================================================>-] 99% eta: 3s
Batch submitting query [===========================================================================================] 100% eta: 0s
i <- rnaseq$meta[,which(patient=="AB559")]
j <- rowSums(rnaseq.TPMs$human$TPMs.correct[,i]==0) != 2
quantile(apply(rnaseq.TPMs$human$TPMs.correct[j,i],1,diff))
0% 25% 50% 75% 100%
-3.19839173 -0.01725355 0.00000000 0.03110988 3.37223850
cor(rnaseq.TPMs$human$TPMs.correct[,i])
AB559-T1-x01-00-001490-T-R1 AB559-T1-x01-00-001490-T-R2
AB559-T1-x01-00-001490-T-R1 1.0000000 0.9833126
AB559-T1-x01-00-001490-T-R2 0.9833126 1.0000000
boxplot(cor(rnaseq.TPMs$human$TPMs.correct))
fwrite(rnaseq.TPMs$meta,"~/Desktop/PDX.rnaseq.char.sample_meta.csv")
#fwrite(rnaseq.TPMs$human$humangene.meta,"~/Desktop/PDX.rnaseq.char.human.gene_meta.csv")
fwrite(as.data.table(rnaseq.TPMs$human$TPMs.correct,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.human.TPMs_corrected.csv")
#fwrite(as.data.table(rnaseq.TPMs$data.TPMs,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.mouse.TPMs.csv")
fwrite(as.data.table(rnaseq.TPMs$mouse$TPMs.correct,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.mouse.TPMs_corrected.csv")
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
set.seed(1); er.type <- kmeans(x[which(rownames(x) == "ENSG00000091831"),],2)$cluster
sequencing <- rnaseq$meta[,merged.lanes]
sample.type <- rnaseq$meta[,sample.type]
d <- model.matrix(~ 0 + sample.type + er.type)
rnaseq.TPMs <- list(
meta = rnaseq$meta,
human = list(
gene.meta = rnaseq$human$gene.meta,
TPMs = calculate_tpm(rnaseq$human$data,
rnaseq$human$gene.meta$Length,
ls=colSums(rnaseq$human$data) + colSums(rnaseq$mouse$data))
),
mouse = list(
gene.meta = rnaseq$mouse$gene.meta,
TPMs = calculate_tpm(rnaseq$mouse$data,
rnaseq$mouse$gene.meta$Length,
ls=colSums(rnaseq$human$data) + colSums(rnaseq$mouse$data))
)
)
rnaseq.TPMs$human$TPMs.correct <- removeBatchEffect(log(rnaseq.TPMs$human$TPMs+0.5),batch = sequencing,design = d)
rnaseq.TPMs$mouse$TPMs.correct <- removeBatchEffect(log(rnaseq.TPMs$mouse$TPMs+0.5),batch = sequencing,design = d)
keyMap <- data.table(getBM(
attributes=c('hgnc_symbol','ensembl_gene_id'),
filters = 'ensembl_gene_id',
values = rownames(rnaseq.TPMs$human$TPMs.correct),
mart = ensembl.human
))
rownames(rnaseq.TPMs$human$TPMs.correct) <- keyMap[match(rownames(rnaseq.TPMs$human$TPMs.correct),keyMap$ensembl_gene_id),]$hgnc_symbol
rnaseq.TPMs$human$TPMs.correct <- rnaseq.TPMs$human$TPMs.correct[rownames(rnaseq.TPMs$human$TPMs.correct) != "",]
rnaseq.TPMs$human$TPMs.correct <- rnaseq.TPMs$human$TPMs.correct[(rowSums(is.na(rnaseq.TPMs$human$TPMs.correct)) == 0),]
keyMap <- data.table(getBM(
attributes=c('mgi_symbol','ensembl_gene_id'),
filters = 'ensembl_gene_id',
values = rownames(rnaseq.TPMs$mouse$TPMs.correct),
mart = ensembl.mouse
))
rownames(rnaseq.TPMs$mouse$TPMs.correct) <- keyMap[match(rownames(rnaseq.TPMs$mouse$TPMs.correct),keyMap$ensembl_gene_id),]$mgi_symbol
rnaseq.TPMs$mouse$TPMs.correct <- rnaseq.TPMs$mouse$TPMs.correct[rownames(rnaseq.TPMs$mouse$TPMs.correct) != "",]
rnaseq.TPMs$mouse$TPMs.correct <- rnaseq.TPMs$mouse$TPMs.correct[(rowSums(is.na(rnaseq.TPMs$mouse$TPMs.correct)) == 0),]
#fwrite(rnaseq.TPMs$meta,"~/Desktop/PDX.rnaseq.char.sample_meta.csv")
#fwrite(rnaseq.TPMs$human$humangene.meta,"~/Desktop/PDX.rnaseq.char.human.gene_meta.csv")
fwrite(as.data.table(rnaseq.TPMs$human$TPMs.correct,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.human.TPMs_corrected_libsize.csv")
#fwrite(as.data.table(rnaseq.TPMs$data.TPMs,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.mouse.TPMs.csv")
fwrite(as.data.table(rnaseq.TPMs$mouse$TPMs.correct,keep.rownames = "Geneid"),"~/Desktop/PDX.rnaseq.char.mouse.TPMs_corrected_libsize.csv")
fname <- paste0(root.dir,"tcga/data/rnaseq.RDS")
if(!file.exists(fname)){
x <- lapply(list.dirs(paste0(root.dir,"tcga/data/rnaseq"),recursive = F),
function(x) fread(cmd=paste("gzip -dc",list.files(x,pattern = "*.gz",full.names = T)[1]))
)
tcga <- as.matrix(do.call(cbind,lapply(x, function(y) y[,2])))
rownames(tcga) <- tstrsplit(unlist(x[[1]][,1]),"\\.")[[1]]
colnames(tcga) <- basename(list.dirs(paste0(root.dir,"tcga/data/rnaseq")))
rm(x)
saveRDS(tcga,fname)
}
tcga <- readRDS(fname)
fname <- paste0(root.dir,"tcga/data/rnaseq.meta.csv")
if(!file.exists(fname)){
library(GenomicDataCommons)
TCGAtranslateID = function(file_ids, legacy = FALSE) {
info = files(legacy = legacy) %>%
filter( ~ file_id %in% file_ids) %>%
select('cases.samples.submitter_id') %>%
results_all()
id_list = lapply(info$cases,function(a) {
a[[1]][[1]][[1]]})
barcodes_per_file = sapply(id_list,length)
return(data.frame(file_id = rep(ids(info),barcodes_per_file),
submitter_id = unlist(id_list)))
}
fnames <- basename(list.dirs(paste0(root.dir,"tcga/data/rnaseq"),recursive = F))
tcga.meta <- data.table(TCGAtranslateID(fnames))
tcga.meta <- tcga.meta[match(fnames,file_id)]
tcga.meta[,bcr_patient_barcode:=substr(submitter_id,0,12)]
clinical <- fread(paste0(root.dir,"tcga/data/clincal_meta_brca.txt"),skip = 1)
tcga.meta <- merge(tcga.meta,clinical,all.x=T,sort=F)
fwrite(tcga.meta,fname)
}
tcga.meta <- fread(fname)
pca.human <- pca.run(tcga,normalise = T)
pca.human <- cbind(pca.human,tcga.meta)
ggpairs(pca.human,columns = 1:3,aes(colour=breast_carcinoma_estrogen_receptor_status))
##PCA (combined) w/ various normalisations
x <- log(calculate_tpm(rnaseq$human$data,rnaseq$human$gene.meta$Length)+0.5)
sequencing <- rnaseq$meta[,merged.lanes]
sample.type <- rnaseq$meta[,sample.type]
set.seed(1); er.type <- kmeans(x[which(rownames(x) == "ENSG00000091831"),],2)$cluster
i <- which(rowSums(rnaseq$human$data) != 0)
x <- rnaseq$human$data[i,]
x <- DGEList(x)
x <- calcNormFactors(x)
x <- voom(x,design = model.matrix(~ 0 + sample.type + er.type + sequencing))
x <- removeBatchEffect(x$E, batch=sequencing, design=model.matrix(~ 0 + sample.type + er.type))
i <- tcga.meta[,breast_carcinoma_estrogen_receptor_status %in% c("Negative","Positive")]
y <- DGEList(tcga[,i])
y <- calcNormFactors(y)
y <- voom(y, design = model.matrix(~ 0 + tcga.meta$breast_carcinoma_estrogen_receptor_status[i]))
x <- cbind(x,y$E[match(rownames(x),rownames(y)),])
rm(y)
db <- c(rep("PDX",nrow(rnaseq$meta)),rep("TCGA",sum(i)))
er <- c(er.type,as.numeric(factor(tcga.meta[i]$breast_carcinoma_estrogen_receptor_status)))
sample.type <- c(rnaseq$meta$sample.type,rep("primary",sum(i)))
pca.human <- pca.run(x,normalise = F,rename = T)
pca.human <- data.table(pca.human,db,er)
ggpairs(pca.human,columns = 1:3,aes(color=db,shape=factor(er)),title = "Without correction")
pca.human <- pca.run(normalize.quantiles(x),normalise = F,rename = T)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db,shape=factor(er)),title = "With quant.norm")
pca.human <- pca.run(removeBatchEffect(x,batch = db,design = model.matrix(~ 0 + sample.type + er)),normalise = F,rename = T)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db,shape=factor(er)),title = "With linear model norm")
pca.human <- pca.run(normalize.quantiles(removeBatchEffect(x,batch = db,design = model.matrix(~ 0 + sample.type + er))),normalise = F,rename = T)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db,shape=factor(er)),title = "With both")
Quantile correction seperates the datasets more than leaving them untouched. Linear model batch correction works well, though some concern remains w.r.t. ER status. Linear+quantile can’t hurt, but hard to justify. :P Xenografts cluster seperately regardless.
sequencing <- rnaseq$meta[,merged.lanes]
sample.type <- rnaseq$meta[,sample.type]
er.type <- rnaseq$meta[,er.inferred]
her2.type <- rnaseq$meta[,HER2.inferred]
i <- which(rowSums(rnaseq$human$data) != 0)
x <- rnaseq$human$data[i,]
x <- DGEList(x)
x <- calcNormFactors(x) # w/o mouse
#x <- calcNormFactors(x,lib.size = rnaseq$meta[,human.library.size + mouse.library.size] ) # w/ mouse
x <- voom(x,design = model.matrix(~ 0 + sample.type + er.type + her2.type + sequencing))
x <- removeBatchEffect(x$E, batch=sequencing, design=model.matrix(~ 0 + sample.type + er.type + her2.type))
i <- tcga.meta[,
breast_carcinoma_estrogen_receptor_status %in% c("Negative","Positive") &
lab_proc_her2_neu_immunohistochemistry_receptor_status %in% c("Negative","Positive")
] #810 samples!
y <- DGEList(tcga[,i])
y <- calcNormFactors(y)
y <- voom(y, design = tcga.meta[i,model.matrix(~ 0 + breast_carcinoma_estrogen_receptor_status + lab_proc_her2_neu_immunohistochemistry_receptor_status)])
combined <- cbind(x,y$E[match(rownames(x),rownames(y)),])
combined <- combined[(rowSums(is.na(combined)) == 0),]
# rm(y)
db <- c(rep("PDX",nrow(rnaseq$meta)),rep("TCGA",sum(i)))
er <- c(er.type,toupper(substr(tcga.meta[i]$breast_carcinoma_estrogen_receptor_status,1,3)))
her2 <- c(her2.type,toupper(substr(tcga.meta[i]$lab_proc_her2_neu_immunohistochemistry_receptor_status,1,3)))
sample.type <- c(rnaseq$meta$sample.type,rep("primary",sum(i)))
library(iC10TrainingData)
data("Map.Exp")
i <- which((rownames(combined) %in% Map.Exp$Ensembl_ID))
j <- which(db == "PDX")
pca.human <- data.table(pca.run(combined[i,j], normalise = F,rename = T, N=NA))
ggpairs(pca.human,columns = 1:3,aes(color=factor(er[j])),title = "Raw")
ggpairs(pca.human,columns = 1:3,aes(color=sample.type[j]),title = "Raw")
pca.human <- data.table(pca.run(
removeBatchEffect(combined[i,j], batch=sample.type[j], design=model.matrix(~ 0 + er[j])),
normalise = F,rename = T, N=NA))
ggpairs(pca.human,columns = 1:3,aes(color=factor(er[j])),title = "Corrected")
ggpairs(pca.human,columns = 1:3,aes(color=sample.type[j]),title = "Corrected")
i <- which((rownames(combined) %in% Map.Exp$Ensembl_ID))
pca.human <- pca.run(combined[i,], normalise = F,rename = T, N=NA)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db),title = "Raw")
ggpairs(pca.human,columns = 1:3,aes(color=factor(er)),title = "Raw")
ggpairs(pca.human,columns = 1:3,aes(color=sample.type),title = "Raw")
pca.human <- pca.run(
removeBatchEffect(combined[i,],batch = db,design = model.matrix(~ 0 + sample.type + er)),
normalise = F,rename = T, N=NA)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db),title = "No sample.type correction")
ggpairs(pca.human,columns = 1:3,aes(color=factor(er)),title = "No sample.type correction")
ggpairs(pca.human,columns = 1:3,aes(color=sample.type),title = "No sample.type correction")
pca.human <- pca.run(
removeBatchEffect(combined[i,], batch=db, batch2=sample.type, design=model.matrix(~ 0 + er)),
normalise = F,rename = T, N=NA)
pca.human <- data.table(pca.human,db)
ggpairs(pca.human,columns = 1:3,aes(color=db),title = "With sample.type correction")
ggpairs(pca.human,columns = 1:3,aes(color=factor(er)),title = "With sample.type correction")
ggpairs(pca.human,columns = 1:3,aes(color=sample.type),title = "With sample.type correction")
x <- combined[rownames(combined) %in% Map.Exp$Ensembl_ID,]
y <- removeBatchEffect(x,batch = db,design = model.matrix(~ 0 + sample.type + er + her2))
Partial NA coefficients for 9 probe(s)
rownames(y) <- rownames(x)
rownames(y) <- Map.Exp[match(rownames(y),Map.Exp$Ensembl_ID),"Gene_symbol"]
y <- y[!is.na(rownames(y)),]
# Run iC10
features <- matchFeatures(Exp=y,Exp.by.feat = "gene")
Found 595 out of 612 Exp features
features <- normalizeFeatures(features, "scale")
res <- iC10(features)
running classifier with only expression...
9 rows with more than 50 % entries missing;
mean imputation used for these rows
12345678910111213141516171819202122232425262728293012345678910Fold 1 :123456789101112131415161718192021222324252627282930
Fold 2 :123456789101112131415161718192021222324252627282930
Fold 3 :123456789101112131415161718192021222324252627282930
Fold 4 :123456789101112131415161718192021222324252627282930
Fold 5 :123456789101112131415161718192021222324252627282930
Fold 6 :123456789101112131415161718192021222324252627282930
Fold 7 :123456789101112131415161718192021222324252627282930
Fold 8 :123456789101112131415161718192021222324252627282930
Fold 9 :123456789101112131415161718192021222324252627282930
Fold 10 :123456789101112131415161718192021222324252627282930
gof <- goodnessOfFit(res)
Correlation between centroids and predicted profiles:
iC1: 0.868
iC2: 0.85
iC3: 0.86
iC4: 0.854
iC5: 0.846
iC6: 0.869
iC7: 0.867
iC8: 0.929
iC9: 0.915
iC10: 0.889
Overall correlation: 0.877
rnaseq$meta$iClust.default <- res$class[1:nrow(rnaseq$meta)]
x <- combined[rownames(combined) %in% Map.Exp$Ensembl_ID,]
y <- removeBatchEffect(x,batch = db,design = model.matrix(~ 0 + sample.type + er + her2))
Partial NA coefficients for 9 probe(s)
y <- y - log(rnaseq$human$gene.meta$Length/1E3)
longer object length is not a multiple of shorter object length
rownames(y) <- rownames(x)
rownames(y) <- Map.Exp[match(rownames(y),Map.Exp$Ensembl_ID),"Gene_symbol"]
y <- y[!is.na(rownames(y)),]
# Run iC10
features <- matchFeatures(Exp=y,Exp.by.feat = "gene")
Found 595 out of 612 Exp features
features <- normalizeFeatures(features, "scale")
res <- iC10(features)
running classifier with only expression...
9 rows with more than 50 % entries missing;
mean imputation used for these rows
12345678910111213141516171819202122232425262728293012345678910Fold 1 :123456789101112131415161718192021222324252627282930
Fold 2 :123456789101112131415161718192021222324252627282930
Fold 3 :123456789101112131415161718192021222324252627282930
Fold 4 :123456789101112131415161718192021222324252627282930
Fold 5 :123456789101112131415161718192021222324252627282930
Fold 6 :123456789101112131415161718192021222324252627282930
Fold 7 :123456789101112131415161718192021222324252627282930
Fold 8 :123456789101112131415161718192021222324252627282930
Fold 9 :123456789101112131415161718192021222324252627282930
Fold 10 :123456789101112131415161718192021222324252627282930
gof <- goodnessOfFit(res)
Correlation between centroids and predicted profiles:
iC1: 0.838
iC2: 0.702
iC3: 0.819
iC4: 0.781
iC5: 0.822
iC6: 0.764
iC7: 0.811
iC8: 0.856
iC9: 0.887
iC10: 0.839
Overall correlation: 0.819
rnaseq$meta$iClust.gl <- res$class[1:nrow(rnaseq$meta)]
y <- removeBatchEffect(combined,batch = db,design = model.matrix(~ 0 + sample.type + er + her2))
Partial NA coefficients for 8964 probe(s)
y <- normalize.quantiles(y)
rownames(y) <- rownames(combined)
rownames(y) <- Map.Exp[match(rownames(y),Map.Exp$Ensembl_ID),"Gene_symbol"]
y <- y[!is.na(rownames(y)),]
# Run iC10
features <- matchFeatures(Exp=y,Exp.by.feat = "gene")
Found 595 out of 612 Exp features
features <- normalizeFeatures(features, "scale")
res <- iC10(features)
running classifier with only expression...
9 rows with more than 50 % entries missing;
mean imputation used for these rows
12345678910111213141516171819202122232425262728293012345678910Fold 1 :123456789101112131415161718192021222324252627282930
Fold 2 :123456789101112131415161718192021222324252627282930
Fold 3 :123456789101112131415161718192021222324252627282930
Fold 4 :123456789101112131415161718192021222324252627282930
Fold 5 :123456789101112131415161718192021222324252627282930
Fold 6 :123456789101112131415161718192021222324252627282930
Fold 7 :123456789101112131415161718192021222324252627282930
Fold 8 :123456789101112131415161718192021222324252627282930
Fold 9 :123456789101112131415161718192021222324252627282930
Fold 10 :123456789101112131415161718192021222324252627282930
gof <- goodnessOfFit(res)
Correlation between centroids and predicted profiles:
iC1: 0.902
iC2: 0.837
iC3: 0.824
iC4: 0.861
iC5: 0.833
iC6: 0.834
iC7: 0.868
iC8: 0.928
iC9: 0.895
iC10: 0.88
Overall correlation: 0.87
rnaseq$meta$iClust.quant <- res$class[1:nrow(rnaseq$meta)]
x <- combined[rownames(combined) %in% Map.Exp$Ensembl_ID,]
y <- removeBatchEffect(x,batch = db, batch2=sample.type, design = model.matrix(~ 0 + er + her2))
rownames(y) <- rownames(x)
rownames(y) <- Map.Exp[match(rownames(y),Map.Exp$Ensembl_ID),"Gene_symbol"]
y <- y[!is.na(rownames(y)),]
# Run iC10
features <- matchFeatures(Exp=y,Exp.by.feat = "gene")
features <- normalizeFeatures(features, "scale")
res <- iC10(features)
gof <- goodnessOfFit(res)
rnaseq$meta$iClust.pdx <- res$class[1:nrow(rnaseq$meta)]
p.IDs <- unique(rnaseq$meta[,.(patient,sample.type)])[,.N,patient][N==2,patient]
x <- rnaseq$meta[patient %in% p.IDs,.(patient,sample.type,iClust.default,iClust.corrected)][order(patient,sample.type)]
x[,.(
iClust.default[1]==iClust.default[-1],
iClust.corrected[1]==iClust.corrected[-1]
),patient][,.(sum(V1)/.N,sum(V2)/.N,.N)]
x[,.(
iClust.default[1]==iClust.default[-1],
iClust.corrected[1]==iClust.corrected[-1]
),patient][,.(sum(V1)/.N,sum(V2)/.N,.N),patient]
x[,.(
iClust.default[1]==iClust.default[-1],
iClust.corrected[1]==iClust.corrected[-1]
),patient][,.(sum(V1)/.N,sum(V2)/.N,.N),patient][,.(mean(V1),mean(V2))]
y <- removeBatchEffect(combined,batch = db,design = model.matrix(~ 0 + sample.type + er))
rownames(y) <- rownames(combined)
keyMap <- data.table(getBM(
attributes=c('entrezgene_id','ensembl_gene_id'),
filters = 'ensembl_gene_id',
values = rownames(y),
mart = ensembl.human
))
Batch submitting query [=>----------------------------------------------------------------------------------------------------] 2% eta: 2m
Batch submitting query [==>---------------------------------------------------------------------------------------------------] 3% eta: 2m
Batch submitting query [===>--------------------------------------------------------------------------------------------------] 4% eta: 2m
Batch submitting query [====>-------------------------------------------------------------------------------------------------] 5% eta: 1m
Batch submitting query [=====>------------------------------------------------------------------------------------------------] 6% eta: 1m
Batch submitting query [======>-----------------------------------------------------------------------------------------------] 7% eta: 1m
Batch submitting query [=======>----------------------------------------------------------------------------------------------] 8% eta: 1m
Batch submitting query [========>---------------------------------------------------------------------------------------------] 9% eta: 1m
Batch submitting query [==========>-------------------------------------------------------------------------------------------] 10% eta: 1m
Batch submitting query [===========>------------------------------------------------------------------------------------------] 11% eta: 1m
Batch submitting query [============>-----------------------------------------------------------------------------------------] 12% eta: 49s
Batch submitting query [=============>----------------------------------------------------------------------------------------] 13% eta: 47s
Batch submitting query [==============>---------------------------------------------------------------------------------------] 14% eta: 45s
Batch submitting query [===============>--------------------------------------------------------------------------------------] 15% eta: 44s
Batch submitting query [================>-------------------------------------------------------------------------------------] 16% eta: 42s
Batch submitting query [=================>------------------------------------------------------------------------------------] 18% eta: 41s
Batch submitting query [==================>-----------------------------------------------------------------------------------] 19% eta: 40s
Batch submitting query [===================>----------------------------------------------------------------------------------] 20% eta: 38s
Batch submitting query [====================>---------------------------------------------------------------------------------] 21% eta: 37s
Batch submitting query [=====================>--------------------------------------------------------------------------------] 22% eta: 36s
Batch submitting query [======================>-------------------------------------------------------------------------------] 23% eta: 35s
Batch submitting query [=======================>------------------------------------------------------------------------------] 24% eta: 35s
Batch submitting query [========================>-----------------------------------------------------------------------------] 25% eta: 34s
Batch submitting query [=========================>----------------------------------------------------------------------------] 26% eta: 34s
Batch submitting query [==========================>---------------------------------------------------------------------------] 27% eta: 33s
Batch submitting query [===========================>--------------------------------------------------------------------------] 28% eta: 32s
Batch submitting query [============================>-------------------------------------------------------------------------] 29% eta: 34s
Batch submitting query [=============================>------------------------------------------------------------------------] 30% eta: 33s
Batch submitting query [===============================>----------------------------------------------------------------------] 31% eta: 32s
Batch submitting query [================================>---------------------------------------------------------------------] 32% eta: 31s
Batch submitting query [=================================>--------------------------------------------------------------------] 33% eta: 31s
Batch submitting query [==================================>-------------------------------------------------------------------] 34% eta: 30s
Batch submitting query [===================================>------------------------------------------------------------------] 35% eta: 30s
Batch submitting query [====================================>-----------------------------------------------------------------] 36% eta: 29s
Batch submitting query [=====================================>----------------------------------------------------------------] 37% eta: 30s
Batch submitting query [======================================>---------------------------------------------------------------] 38% eta: 29s
Batch submitting query [=======================================>--------------------------------------------------------------] 39% eta: 29s
Batch submitting query [========================================>-------------------------------------------------------------] 40% eta: 28s
Batch submitting query [=========================================>------------------------------------------------------------] 41% eta: 28s
Batch submitting query [==========================================>-----------------------------------------------------------] 42% eta: 27s
Batch submitting query [===========================================>----------------------------------------------------------] 43% eta: 26s
Batch submitting query [============================================>---------------------------------------------------------] 44% eta: 26s
Batch submitting query [=============================================>--------------------------------------------------------] 45% eta: 25s
Batch submitting query [==============================================>-------------------------------------------------------] 46% eta: 25s
Batch submitting query [===============================================>------------------------------------------------------] 47% eta: 24s
Batch submitting query [================================================>-----------------------------------------------------] 48% eta: 24s
Batch submitting query [=================================================>----------------------------------------------------] 49% eta: 23s
Batch submitting query [===================================================>--------------------------------------------------] 51% eta: 22s
Batch submitting query [====================================================>-------------------------------------------------] 52% eta: 22s
Batch submitting query [=====================================================>------------------------------------------------] 53% eta: 21s
Batch submitting query [======================================================>-----------------------------------------------] 54% eta: 21s
Batch submitting query [=======================================================>----------------------------------------------] 55% eta: 20s
Batch submitting query [========================================================>---------------------------------------------] 56% eta: 20s
Batch submitting query [=========================================================>--------------------------------------------] 57% eta: 20s
Batch submitting query [==========================================================>-------------------------------------------] 58% eta: 19s
Batch submitting query [===========================================================>------------------------------------------] 59% eta: 19s
Batch submitting query [============================================================>-----------------------------------------] 60% eta: 18s
Batch submitting query [=============================================================>----------------------------------------] 61% eta: 18s
Batch submitting query [==============================================================>---------------------------------------] 62% eta: 17s
Batch submitting query [===============================================================>--------------------------------------] 63% eta: 17s
Batch submitting query [================================================================>-------------------------------------] 64% eta: 16s
Batch submitting query [=================================================================>------------------------------------] 65% eta: 16s
Batch submitting query [==================================================================>-----------------------------------] 66% eta: 15s
Batch submitting query [===================================================================>----------------------------------] 67% eta: 15s
Batch submitting query [====================================================================>---------------------------------] 68% eta: 15s
Batch submitting query [=====================================================================>--------------------------------] 69% eta: 14s
Batch submitting query [=======================================================================>------------------------------] 70% eta: 14s
Batch submitting query [========================================================================>-----------------------------] 71% eta: 13s
Batch submitting query [=========================================================================>----------------------------] 72% eta: 13s
Batch submitting query [==========================================================================>---------------------------] 73% eta: 12s
Batch submitting query [===========================================================================>--------------------------] 74% eta: 12s
Batch submitting query [============================================================================>-------------------------] 75% eta: 11s
Batch submitting query [=============================================================================>------------------------] 76% eta: 11s
Batch submitting query [==============================================================================>-----------------------] 77% eta: 10s
Batch submitting query [===============================================================================>----------------------] 78% eta: 10s
Batch submitting query [================================================================================>---------------------] 79% eta: 9s
Batch submitting query [=================================================================================>--------------------] 80% eta: 9s
Batch submitting query [==================================================================================>-------------------] 81% eta: 8s
Batch submitting query [===================================================================================>------------------] 82% eta: 8s
Batch submitting query [====================================================================================>-----------------] 84% eta: 7s
Batch submitting query [=====================================================================================>----------------] 85% eta: 7s
Batch submitting query [======================================================================================>---------------] 86% eta: 6s
Batch submitting query [=======================================================================================>--------------] 87% eta: 6s
Batch submitting query [========================================================================================>-------------] 88% eta: 5s
Batch submitting query [=========================================================================================>------------] 89% eta: 5s
Batch submitting query [==========================================================================================>-----------] 90% eta: 5s
Batch submitting query [============================================================================================>---------] 91% eta: 4s
Batch submitting query [=============================================================================================>--------] 92% eta: 4s
Batch submitting query [==============================================================================================>-------] 93% eta: 3s
Batch submitting query [===============================================================================================>------] 94% eta: 3s
Batch submitting query [================================================================================================>-----] 95% eta: 2s
Batch submitting query [=================================================================================================>----] 96% eta: 2s
Batch submitting query [==================================================================================================>---] 97% eta: 1s
Batch submitting query [===================================================================================================>--] 98% eta: 1s
Batch submitting query [====================================================================================================>-] 99% eta: 0s
Batch submitting query [======================================================================================================] 100% eta: 0s
rownames(y) <- keyMap[match(rownames(y),keyMap$ensembl_gene_id),]$entrezgene_id
y <- y[!is.na(rownames(y)),]
dmat <- t(y)
dannot <- as.matrix(data.frame("probe"=rownames(y),"EntrezGene.ID" = rownames(y)))
rownames(dannot) <- rownames(y)
pam50.annon <- molecular.subtyping(sbt.model = "pam50",data = dmat,annot = dannot,do.mapping = T)
rnaseq$meta$PAM50 <- pam50.annon$subtype[1:nrow(rnaseq$meta)]
fwrite(rnaseq$meta,"~/Desktop/PDX.rnaseq.char.sample_meta.csv")
#IC10 (subsampling)
#rnaseq.pdx <- subset.PDX(rnaseq,which(rnaseq$meta[,human.library.size>10E6]))
keyMap <- data.table(getBM(
attributes=c('hgnc_symbol','ensembl_gene_id'),
filters = 'ensembl_gene_id',
values = rnaseq$human$gene.meta$Geneid,
mart = ensembl))
subsample.counts <- function(x,totalReads,scaling.factors=NULL){
y <- sapply(1:ncol(x),function(i){
y <- x[,i]
n <- length(y)
if(is.null(scaling.factors)) N <- totalReads
else N <- totalReads * scaling.factors[i]
y <- sample(1:n,N,replace=T,prob=y)
z <- rep(0,n)
for(i in 1:N){
j <- y[i]
z[j] = z[j] + 1
}
z
})
rownames(y) <- rownames(x)
y
}
x <- cbind(rnaseq$human$data,tcga[match(rnaseq$human$gene.meta$Geneid,rownames(tcga)),])
readLimits <- seq(10,2,-2)*1E6
res.pdx <- lapply(readLimits,function(totalReads){
# Subsample and then bind with TCGA
z <- subsample.counts(x,totalReads,scaling.factors)
z <- cbind(z,y)
z <- z[rowSums(z)!=0,]
# Normalise
#z <- t(1E6 * t(z) / (colSums(z) * calcNormFactors(z)))
group <- c(rep("PDX",dim(x)[2]),rep("TCGA",dim(y)[2]))
design <- model.matrix(~ 0 + group)
z <- voom(z,design)
# Run iC10
features <- matchFeatures(Exp=z$E,Exp.by.feat = "gene")
features <- normalizeFeatures(features, "scale")
res <- iC10(features)
gof <- goodnessOfFit(res)
res <- res$posterior[1:ncol(x),]
rownames(res) <- colnames(x)
gof <- gof$indiv[1:ncol(x)]
names(gof) <- colnames(x)
return(list("res"=res,"gof"=gof))
})
names(res.pdx) <- paste0(readLimits/1E6,".milion.reads")
z <- rbindlist(lapply(lapply(res.pdx,"[[","res"), melt),idcol = "total.reads")
z <- z[,.(sample.name=Var1,total.mil.reads=as.integer(tstrsplit(total.reads,"\\.")[[1]]),iC10=Var2,iC10.prob=value)]
z$sample.name <- factor(z$sample.name,levels = sort(unique(as.vector(z$sample.name))))
z$iC10 <- factor(z$iC10)
ggplot(z) + aes(x=sample.name,y=iC10.prob,fill=iC10) + geom_col() + facet_grid(total.mil.reads~.) +
theme_bw(15) + theme(axis.text.x = element_text(angle=-90,hjust = 0)) + xlab("") +
scale_fill_manual(values=c('#FF5500', '#00EE76', '#CD3278','#00C5CD', '#8B0000','#FFFF40', '#0000CD', '#FFAA00', '#EE82EE', '#7D26CD'))
z <- data.table(melt(sapply(res.pdx,"[[","gof")))
z <- z[,.(sample.name=Var1,total.mil.reads=as.integer(tstrsplit(Var2,"\\.")[[1]]),gof=value)]
ggplot(z) + aes(x=total.mil.reads,y=gof,group=sample.name) + geom_line()
x <- rnaseq.pdx$human
keyMap <- data.table(getBM(attributes=c('hgnc_symbol','ensembl_gene_id'),filters = 'ensembl_gene_id', values = rownames(x), mart = ensembl))
rownames(x) <- keyMap[match(rownames(x),ensembl_gene_id),hgnc_symbol]
data(Map.Exp)
y <- x[rownames(x) %in% Map.Exp$Gene_symbol,]
y <- y[rowSums(y)!=0,]
z <- rbind(
data.table(patient=rnaseq.pdx$meta$patient,iC10=apply(res.pdx[[1]]$res,1,which.max),genes="all",pca.run(x)[,1:2]),
data.table(patient=rnaseq.pdx$meta$patient,iC10=apply(res.pdx[[1]]$res,1,which.max),genes="subset",pca.run(y,N=nrow(y))[,1:2])
)
ggplot(z) + aes(x=PC1,y=PC2,colour=factor(iC10),label=patient) + geom_text() + theme_bw(15) + facet_wrap(~genes,scales = "free")